This is our exploratory analysis for building a safety routing feature based on an AI model to predict accidents in the city of London. Our aim is to build a feature capable of helping people move with more confidence through a metropolis and optimizing routing by also considering accident rates. In this introduction, we will delve deeper into the dataset, explore key variables, and analyze patterns to inform our safety routing feature development.
import pandas as pd
import numpy as np
import networkx as nx
import plotly.express as px
import plotly.io as pio
import geopandas as gpd
from shapely.geometry import shape, Point, box
import osmnx as ox
import matplotlib.pyplot as plt
import seaborn as sns
import json
from folium.plugins import HeatMap
import pydeck as pdk
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.font_manager as fm
import networkx as nx
# Inventory of fonts matplotlib can see.
# NOTE(review): 'font_names' is not referenced again in this notebook —
# presumably kept for interactively checking that "Proxima Nova" is installed.
font_names = [f.name for f in fm.fontManager.ttflist]
# Use Proxima Nova for all seaborn plots (matplotlib falls back to its
# default font if the family is missing).
sns.set(font="Proxima Nova")
import altair as alt
import calendar
import plotly.graph_objects as go
from itertools import chain
from plotly.subplots import make_subplots
import time
# Load the raw UK road-accident dataset.
df = pd.read_csv('/Users/behemoth/Downloads/Road Accident Data.csv')
# The source data contains a known misspelling: 'Fetal' should read 'Fatal'.
severity_is_misspelled = df['Accident_Severity'] == 'Fetal'
df.loc[severity_is_misspelled, 'Accident_Severity'] = 'Fatal'
We gathered the data within London's statistical geocoordinates according to the London statistical office. This data includes the City of London and London's boroughs. Only data points that fall within these coordinates are considered.
# Load the GeoJSON file that contains the London boundary polygons.
with open('/Users/behemoth/Downloads/inner_london_polygons.json', 'r') as f:
    geojson_data = json.load(f)

# Locate the feature whose 'definition' property marks the statistical boundary.
statistical_boundary = next(
    (shape(feature['geometry'])
     for feature in geojson_data['features']
     if feature['properties'].get('definition') == 'statistical'),
    None,
)
if statistical_boundary is None:
    raise ValueError("Statistical boundary not found in GeoJSON file")

# Keep only accidents whose coordinates fall inside the statistical boundary.
# Vectorised point-in-polygon test (GeoSeries.within) replaces the previous
# row-by-row DataFrame.apply, which built one shapely Point per row in Python.
accident_points = gpd.GeoSeries(
    gpd.points_from_xy(df['Longitude'], df['Latitude'])
)
london_data = df[accident_points.within(statistical_boundary).values]
Checking our dataset, we see that we mostly have categorical variables. As a next step, we consider any potential missing values and decide which columns to focus our analysis on.
# Quick look at the filtered London subset.
london_data
print(london_data.columns)
# NOTE(review): 'Accident Date' is still a string here, so min()/max() compare
# lexicographically — '9/9/2022' is the string maximum, not necessarily the
# latest date. Parse to datetime before reporting a true date range.
print(f"London accident data range from {london_data['Accident Date'].min()} to {london_data['Accident Date'].max()}")
Index(['Accident_Index', 'Accident Date', 'Day_of_Week', 'Junction_Control',
'Junction_Detail', 'Accident_Severity', 'Latitude', 'Light_Conditions',
'Local_Authority_(District)', 'Carriageway_Hazards', 'Longitude',
'Number_of_Casualties', 'Number_of_Vehicles', 'Police_Force',
'Road_Surface_Conditions', 'Road_Type', 'Speed_limit', 'Time',
'Urban_or_Rural_Area', 'Weather_Conditions', 'Vehicle_Type'],
dtype='object')
London accident data range from 1/1/2021 to 9/9/2022
# Per-column count of missing entries in the London subset.
missing_values = london_data.isna().sum()
missing_values
Accident_Index 0 Accident Date 0 Day_of_Week 0 Junction_Control 0 Junction_Detail 0 Accident_Severity 0 Latitude 0 Light_Conditions 0 Local_Authority_(District) 0 Carriageway_Hazards 22814 Longitude 0 Number_of_Casualties 0 Number_of_Vehicles 0 Police_Force 0 Road_Surface_Conditions 0 Road_Type 2 Speed_limit 0 Time 0 Urban_or_Rural_Area 0 Weather_Conditions 66 Vehicle_Type 0 dtype: int64
We drop Carriageway_Hazards as it is almost entirely missing for our London data and therefore does not contribute to the analysis.
# london_data['Carriageway_Hazards'].unique()
# Carriageway_Hazards is almost entirely missing for the London subset, so drop it.
london_data = london_data.drop(columns=['Carriageway_Hazards'])
london_data.shape
(22997, 20)
# Summary statistics for the numeric columns (lat/lon, casualties, vehicles,
# speed limit); categorical columns are excluded by describe()'s default.
summary_statistics = london_data.describe()
summary_statistics
| Latitude | Longitude | Number_of_Casualties | Number_of_Vehicles | Speed_limit | |
|---|---|---|---|---|---|
| count | 22997.000000 | 22997.000000 | 22997.000000 | 22997.000000 | 22997.000000 |
| mean | 51.507395 | -0.105841 | 1.151759 | 1.728399 | 30.202200 |
| std | 0.041878 | 0.067194 | 0.492170 | 0.581562 | 1.964635 |
| min | 51.412793 | -0.256464 | 1.000000 | 1.000000 | 20.000000 |
| 25% | 51.476704 | -0.151853 | 1.000000 | 1.000000 | 30.000000 |
| 50% | 51.510918 | -0.110755 | 1.000000 | 2.000000 | 30.000000 |
| 75% | 51.533318 | -0.066081 | 1.000000 | 2.000000 | 30.000000 |
| max | 51.609266 | 0.079620 | 11.000000 | 9.000000 | 70.000000 |
A box and whisker plot was then created for the numerical attributes speed limit, number of casualties and number of vehicles in order to understand their distribution. The plot shows that most recorded accidents had a speed limit of around 30, however there are some outliers including speed limits of 70 and 20. The plot further shows that the number of casualties in the recorded accidents is concentrated at 1, with a few outliers shown by points positioned above the box. Moreover, the number of vehicles involved in the accidents is mostly distributed at either one or two.
attributes_to_plot = ['Speed_limit', 'Number_of_Casualties', 'Number_of_Vehicles']
fig, axes = plt.subplots(1, len(attributes_to_plot), figsize=(16, 6))
# One box plot per numeric attribute.
# BUG FIX: the previous whis=50 stretched the whiskers to 50x the IQR, which
# hid every outlier flier and contradicted the outlier discussion above; the
# seaborn default (1.5 IQR) restores the outlier points.
for i, col in enumerate(attributes_to_plot):
    sns.boxplot(data=london_data, y=col, ax=axes[i],
                color='#4285F4', linewidth=2)  # Google Blue color
    axes[i].set_ylim(0, None)
    axes[i].set_ylabel(col, color='grey', fontsize=14, fontweight='bold')
    axes[i].set_title(f'Box Plot for {col}', color='grey', fontsize=16, fontweight='bold')
    axes[i].spines['top'].set_visible(False)
    axes[i].spines['right'].set_visible(False)
    axes[i].tick_params(colors='grey', labelsize=14)
fig.patch.set_facecolor('none')  # Transparent background
plt.tight_layout()
plt.show()
This code defines a function to preprocess data by converting date and time columns to datetime, extracting features, encoding categorical variables, handling missing numerical values, and dropping unnecessary columns. This is the data we'll be using for the random forest model.
def preprocess_data(data):
    """Prepare the accident data for the random-forest model.

    Converts 'Accident Date' and 'Time' into numeric features (Hour, Month,
    Year, Day_of_Week), label-encodes every remaining object column except
    'Accident_Index', and fills missing numeric values with the column median.

    Parameters
    ----------
    data : pd.DataFrame
        Raw accident data. Mutated in place — pass a copy to preserve it.

    Returns
    -------
    (pd.DataFrame, dict)
        The transformed frame and a mapping {column: {code: original_label}}
        so encoded values can be translated back later.
    """
    # Parse date and time once; invalid entries become NaT / NaN features.
    data['Accident Date'] = pd.to_datetime(data['Accident Date'], errors='coerce')
    parsed_time = pd.to_datetime(data['Time'], format='%H:%M', errors='coerce')
    # Hour is derived from the single parse above, avoiding the old second
    # string round-trip and its "Could not infer format" UserWarning.
    data['Hour'] = parsed_time.dt.hour
    data['Time'] = parsed_time.dt.time
    data['Month'] = data['Accident Date'].dt.month
    data['Year'] = data['Accident Date'].dt.year
    data['Day_of_Week'] = data['Accident Date'].dt.dayofweek

    # The raw date/time columns are no longer needed once features exist.
    data.drop(['Accident Date', 'Time'], axis=1, inplace=True)

    # Encode categorical variables and remember the code -> label mappings.
    label_mappings = {}
    for col in data.select_dtypes(include=['object']).columns.drop('Accident_Index'):
        encoder = LabelEncoder()
        data[col] = encoder.fit_transform(data[col])
        label_mappings[col] = dict(enumerate(encoder.classes_))

    # Fill missing numeric values with the column median. Plain assignment
    # replaces chained `inplace=True` fillna, which is deprecated in pandas 2.x
    # and can silently operate on a copy.
    for col in data.select_dtypes(include=['int64', 'float64']).columns:
        data[col] = data[col].fillna(data[col].median())
    return data, label_mappings
# Apply preprocessing on a copy so london_data keeps its original columns
# (preprocess_data mutates its argument in place).
data_copy = london_data.copy()
preprocessed_data, category_mappings = preprocess_data(data_copy)
# Display the preprocessed data
preprocessed_data.head()
/var/folders/zp/8608g3pd2td58x042d94m2sw0000gn/T/ipykernel_20671/2184440651.py:7: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format. data['Hour'] = pd.to_datetime(data['Time'].astype(str), errors='coerce').dt.hour
| Accident_Index | Day_of_Week | Junction_Control | Junction_Detail | Accident_Severity | Latitude | Light_Conditions | Local_Authority_(District) | Longitude | Number_of_Casualties | ... | Police_Force | Road_Surface_Conditions | Road_Type | Speed_limit | Urban_or_Rural_Area | Weather_Conditions | Vehicle_Type | Hour | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 200901BS70001 | 4 | 3 | 8 | 1 | 51.512273 | 4 | 13 | -0.201349 | 1 | ... | 1 | 0 | 1 | 30 | 0 | 1 | 2 | 15 | 1 | 2021 |
| 1 | 200901BS70002 | 1 | 3 | 0 | 1 | 51.514399 | 4 | 13 | -0.199248 | 11 | ... | 1 | 4 | 3 | 30 | 0 | 1 | 11 | 10 | 1 | 2021 |
| 2 | 200901BS70003 | 0 | 3 | 8 | 2 | 51.486668 | 4 | 13 | -0.179599 | 1 | ... | 1 | 0 | 3 | 30 | 0 | 1 | 11 | 14 | 1 | 2021 |
| 3 | 200901BS70004 | 1 | 1 | 8 | 1 | 51.507804 | 4 | 13 | -0.203110 | 1 | ... | 1 | 2 | 3 | 30 | 0 | 3 | 9 | 8 | 1 | 2021 |
| 4 | 200901BS70005 | 2 | 1 | 0 | 1 | 51.482076 | 1 | 13 | -0.173445 | 1 | ... | 1 | 0 | 3 | 30 | 0 | 1 | 2 | 17 | 1 | 2021 |
5 rows × 21 columns
This analysis helps in understanding feature importance for predicting accident severity.
data = preprocessed_data.copy()

# Features / target: severity is the label; the index column is an identifier.
X = data.drop(['Accident_Severity', 'Accident_Index'], axis=1)
y = data['Accident_Severity']

# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random-forest baseline.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model evaluation.
# NOTE(review): the classes are heavily imbalanced (most accidents are
# 'Slight'), so overall accuracy is dominated by the majority class — read
# the per-class recall in the classification report instead.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Mean-decrease-in-impurity (Gini) feature importances, largest first.
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# Feature groups for colour coding: location / time / everything else.
location_features = ['Latitude', 'Longitude']
time_features = ['Hour', 'Month', 'Year', 'Day_of_Week']
weather_road_features = ['Junction_Control', 'Junction_Detail', 'Road_Surface_Conditions', 'Road_Type', 'Speed_limit', 'Urban_or_Rural_Area', 'Weather_Conditions']

# Plotting feature importances with colour-coded groups.
# FIX: plain matplotlib barh replaces sns.barplot(palette=...) — passing a
# palette without hue is deprecated (FutureWarning seen in the cell output)
# and barh draws the same colour-coded horizontal bars.
plt.figure(figsize=(12, 8))
feature_colors = ['#4285F4' if col in location_features else '#34A853' if col in time_features else '#FBBC05' for col in feature_importances.index]
plt.barh(feature_importances.index, feature_importances.values, color=feature_colors)
plt.gca().invert_yaxis()  # largest importance on top, matching the old layout
plt.title("Feature Importances in Predicting Accident Severity", fontname='Proxima Nova')
plt.xlabel("Importance")  # based on Gini index /entropy decrease or information gain
plt.ylabel("Features")
plt.show()
Accuracy: 0.8594202898550725
Classification Report:
precision recall f1-score support
0 0.59 0.16 0.25 118
1 0.25 0.00 0.01 855
2 0.86 1.00 0.92 5927
accuracy 0.86 6900
macro avg 0.57 0.39 0.39 6900
weighted avg 0.78 0.86 0.80 6900
/var/folders/zp/8608g3pd2td58x042d94m2sw0000gn/T/ipykernel_20671/3886745045.py:34: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
Now we dig deeper on analysing the features based on our three core research questions as an initial analysis. Continuing this exploration, we will first focus on understanding our data features to build more appropriate visualizations for each research question. Here, we begin by examining the location data.
The majority of accidents are classified as 'Slight', with fewer incidents categorized as 'Serious' or 'Fatal'.
# Severity classes ordered by frequency (most common first).
print(london_data['Accident_Severity'].value_counts().index)
Index(['Slight', 'Serious', 'Fatal'], dtype='object', name='Accident_Severity')
sev_counts = london_data['Accident_Severity'].value_counts().reset_index()
sev_counts.columns = ['Severity', 'Count']
# Share of each severity class.
sev_counts['Percentage'] = sev_counts['Count'] / sev_counts['Count'].sum() * 100
# Bar chart of severity counts with percentages.
# BUG FIX: category_orders must be keyed by the plotted column name
# ('Severity'); the old key 'Accident_Severity' was silently ignored.
fig = px.bar(sev_counts, x='Severity', y='Count',
             color='Severity', color_discrete_sequence=['#34A853','#FBBC05','#EA4335'],
             category_orders={'Severity': ['Slight', 'Serious', 'Fatal']},
             template='simple_white', title='<b>Accident Severity in London</b>')
# Update layout for modern style: light gridlines, transparent plot area,
# bars sorted tallest-first.
fig.update_layout(yaxis_gridcolor='#F0F2F6', xaxis={'categoryorder':'total descending'},
                  plot_bgcolor='rgba(0,0,0,0)', height=400, width = 600,
                  hoverlabel=dict(font_size=14))
fig.update_layout(font=dict(family="Product Sans, sans-serif"))
# Annotate each bar with its percentage share, offset just above the bar top.
for i in range(len(sev_counts)):
    fig.add_annotation(x=sev_counts['Severity'][i], y=sev_counts['Count'][i] +1020,
                       text=f"{sev_counts['Percentage'][i]:.1f}%", showarrow=False, font=dict(size=12))
fig.show()
After analyzing the scatterplot based on accident severity, it was observed that the majority of fatal accidents tend to occur closer to the city center. Conversely, a significant portion of accidents reported only slight severity. This insight sheds light on the distribution of accident severity in relation to the geographical location within the city.
zoom_level = 10.2
london_center = {"lat": 51.5074, "lon": -0.1278}

# Scatter map: one marker per accident, coloured by severity class.
fig = px.scatter_mapbox(
    london_data,
    lat='Latitude',
    lon='Longitude',
    color='Accident_Severity',
    color_discrete_sequence=['#34A853','#FBBC05','#EA4335'],
    category_orders={'Accident_Severity': ['Slight', 'Serious', 'Fatal']},
    zoom=zoom_level,
    center=london_center,
    height=650,
    width=1000,
    title='<b>Accident Severity in London</b>',
    template='simple_white'
)

# Single layout pass: the open-street-map style needs no Mapbox token, and the
# camera (bearing/pitch/zoom/center) is stated once instead of twice.
fig.update_layout(
    font=dict(family="Product Sans, sans-serif"),
    margin=dict(l=0, r=0, t=30, b=0),
    showlegend=True,
    mapbox=dict(
        style="open-street-map",
        bearing=0,
        pitch=0,
        zoom=zoom_level,
        center=dict(lat=51.5074, lon=-0.1278),
    ),
)
fig.update_traces(marker=dict(size=5))

fig.write_image("/Users/behemoth/Downloads/visualization_output.svg")
fig.show()
# Map each accident to a weight used by the density heatmap below.
# BUG FIX: the dataset's labels are capitalised ('Slight', 'Serious', 'Fatal'
# — see the value_counts output above); the previous lowercase keys matched
# nothing, so every Severity_Weight came out NaN.
# NOTE(review): smaller weight = more severe here (Fatal 0.25 < Slight 0.75) —
# confirm this inversion is intentional for the colour scale. The
# 'no accident' entry never occurs in this dataset and is kept as a placeholder.
severity_weights = {'no accident': 1, 'Slight': 0.75, 'Serious': 0.5, 'Fatal': 0.25}
london_data['Severity_Weight'] = london_data['Accident_Severity'].map(severity_weights)
# Camera zoom for the density view (slightly tighter than the scatter map).
zoom_level = 10.75
# Density (heat) map weighted by Severity_Weight rather than raw counts.
fig = px.density_mapbox(
    london_data,
    lat='Latitude',
    lon='Longitude',
    z='Severity_Weight',
    radius=5,  # Decreased the radius for less density
    center={"lat": 51.5074, "lon": -0.1278},
    zoom=zoom_level,
    mapbox_style="carto-positron",
    height=650,
    color_continuous_scale= 'viridis',
    width=1000,
    title='<b>Accident Severity Weights in London</b>',
)
# Update layout properties
fig.update_layout(
    font=dict(family="Product Sans, sans-serif"),
    margin=dict(l=0, r=0, t=50, b=0),
)
fig.update_layout(
    showlegend=False,
    mapbox=dict(
        bearing=0,
        pitch=0,
        zoom=zoom_level,
        center=dict(lat=51.5074, lon=-0.1278),
        layers=[],  # Update this accordingly if you want to add more layers.
    ),
)
# Slight transparency so the base map stays readable under the heat layer.
fig.update_traces(opacity=0.6)
fig.write_image("/Users/behemoth/Downloads/accident_severity_heatmap.svg")
fig.show()
We are analyzing accident data in London to build a safety routing feature for Google Maps. This involves mapping locations with high accident rates. However, we encountered an issue regarding causality and the relationship between traffic and accidents. Simply mapping areas with many accidents may highlight locations with heavy traffic rather than dangerous intersections. We need to consider the correlation between traffic and accidents at specific locations.
For this reason, we'll be looking into traffic data from the London transport office.
We start by downloading the dataset from the London Department for Transport website https://roadtraffic.dft.gov.uk/regions/6. Our dataset represents the Annual Average Daily Flow by location. Among the features of our dataset, since the accident data we have concerns motor vehicles, we'll be looking at the latitude, longitude and all_motor_vehicles columns.
#df_countpoints = pd.read_csv('/Users/behemoth/Downloads/Countpoints Region 6.csv')
# DfT Annual Average Daily Flow (AADF) traffic counts for region 6 (London).
df_regions = pd.read_csv('/Users/behemoth/Downloads/dft_aadf_region_id_6.csv')
print("\nColumn names of df_regions:")
print(df_regions.columns)
df_regions.head()
Column names of df_regions:
Index(['count_point_id', 'year', 'region_id', 'region_name',
'local_authority_id', 'local_authority_name', 'road_name', 'road_type',
'start_junction_road_name', 'end_junction_road_name', 'easting',
'northing', 'latitude', 'longitude', 'link_length_km',
'link_length_miles', 'estimation_method', 'estimation_method_detailed',
'pedal_cycles', 'two_wheeled_motor_vehicles', 'cars_and_taxis',
'buses_and_coaches', 'lgvs', 'hgvs_2_rigid_axle', 'hgvs_3_rigid_axle',
'hgvs_4_or_more_rigid_axle', 'hgvs_3_or_4_articulated_axle',
'hgvs_5_articulated_axle', 'hgvs_6_articulated_axle', 'all_hgvs',
'all_motor_vehicles'],
dtype='object')
| count_point_id | year | region_id | region_name | local_authority_id | local_authority_name | road_name | road_type | start_junction_road_name | end_junction_road_name | ... | buses_and_coaches | lgvs | hgvs_2_rigid_axle | hgvs_3_rigid_axle | hgvs_4_or_more_rigid_axle | hgvs_3_or_4_articulated_axle | hgvs_5_articulated_axle | hgvs_6_articulated_axle | all_hgvs | all_motor_vehicles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6750 | 2014 | 6 | London | 96 | Islington | A201 | Major | Swinton St | Pentonville Rd | ... | 787 | 1728 | 204 | 74 | 132 | 15 | 6 | 18 | 449 | 11916 |
| 1 | 8352 | 2014 | 6 | London | 104 | Lewisham | A2 | Major | A20 | A20 | ... | 430 | 1415 | 258 | 36 | 58 | 6 | 11 | 22 | 390 | 9426 |
| 2 | 6807 | 2014 | 6 | London | 178 | Kingston upon Thames | A240 | Major | A3210 | A307 | ... | 175 | 1589 | 251 | 35 | 13 | 2 | 4 | 19 | 325 | 12951 |
| 3 | 27765 | 2014 | 6 | London | 57 | Barnet | A5109 | Major | Barnet Lane | A1000/A109 | ... | 302 | 1651 | 246 | 12 | 12 | 0 | 1 | 1 | 273 | 19276 |
| 4 | 8488 | 2014 | 6 | London | 104 | Lewisham | A20 | Major | A20 Amersham Rd | A2 Parkfield Rd | ... | 656 | 1617 | 286 | 83 | 27 | 5 | 8 | 7 | 415 | 12821 |
5 rows × 31 columns
We only consider the data between 2021 and 2022 so we verify our current range and drop the values that aren't contained within 2021 and 2022.
# Confirm the span of years present, then keep only 2021-2022 so the traffic
# data matches the accident dataset's coverage.
year_range = df_regions['year'].min(), df_regions['year'].max()
print("Range of data in terms of year:", year_range)
df_regions_2122 = df_regions[df_regions['year'].between(2021, 2022)]
Range of data in terms of year: (2000, 2022)
At this point we also need only the data that is within London's statistical boundaries, so we use the same GeoJSON boundary we used before and drop any values that fall outside our polygon.
# Restrict the traffic count points to London's statistical boundary.
# Vectorised point-in-polygon test (GeoSeries.within) replaces the per-row
# DataFrame.apply that built one shapely Point per row in Python.
countpoint_geoms = gpd.GeoSeries(
    gpd.points_from_xy(df_regions_2122['longitude'], df_regions_2122['latitude'])
)
df_regions_2122 = df_regions_2122[countpoint_geoms.within(statistical_boundary).values]
print("Missing values in df_2021_2022:")
print(df_regions_2122.isnull().sum())
df_regions_2122.shape
Missing values in df_2021_2022: count_point_id 0 year 0 region_id 0 region_name 0 local_authority_id 0 local_authority_name 0 road_name 0 road_type 0 start_junction_road_name 110 end_junction_road_name 110 easting 0 northing 0 latitude 0 longitude 0 link_length_km 106 link_length_miles 106 estimation_method 0 estimation_method_detailed 0 pedal_cycles 0 two_wheeled_motor_vehicles 0 cars_and_taxis 0 buses_and_coaches 0 lgvs 0 hgvs_2_rigid_axle 0 hgvs_3_rigid_axle 0 hgvs_4_or_more_rigid_axle 0 hgvs_3_or_4_articulated_axle 0 hgvs_5_articulated_axle 0 hgvs_6_articulated_axle 0 all_hgvs 0 all_motor_vehicles 0 dtype: int64
(1840, 31)
The traffic data for 2021 and 2022 is relatively limited compared to the accident data collected. This limitation is attributed to the data collection process. Considering the significant impact of data collection dates on our dataset, particularly during the Covid restrictions in 2021, we have decided not to include data points from outside the range of the accident dataset.
# Compare spatial granularity: distinct coordinate pairs in each dataset.
unique_lat_long_london = len(london_data[['Latitude', 'Longitude']].drop_duplicates())
print("Count of unique latitude and longitude rows in London data:", unique_lat_long_london)

unique_lat_long_df_regions_2122 = len(df_regions_2122[['latitude', 'longitude']].drop_duplicates())
print("Count of unique latitude and longitude rows in df_regions_2122:", unique_lat_long_df_regions_2122)
Count of unique latitude and longitude rows in London data: 18154 Count of unique latitude and longitude rows in df_regions_2122: 924
One of the main differences from our traffic data is that we have fewer data points by latitude and longitude for traffic compared to accidents. When we map a heatmap of traffic in london, it's easy to note that areas in the city center where more accidents tend to happen are also the most trafficked, but we also note that towards East London, closer to Canning Town and the Greenwich Peninsula, there is also a much higher rate of traffic, potentially due to the commuters' influx into the city and river crossings in that area.
Even with fewer accident data points, the spatial correlation between high traffic and accident-prone areas is clear in Central London. However, the East London anomaly indicates traffic alone does not fully explain accident rates.
# Define zoom level
zoom_level = 10.75
# Density map of traffic volume (AADF all_motor_vehicles) per count point.
fig = px.density_mapbox(
    df_regions_2122,
    lat='latitude',
    lon='longitude',
    z='all_motor_vehicles',
    radius=10,
    center={"lat": 51.5074, "lon": -0.1278},
    zoom=zoom_level,
    mapbox_style="carto-positron",
    height=650,
    color_continuous_scale= 'viridis',
    width=1000,
    # BUG FIX: this figure visualises traffic volume, not accident severity —
    # the previous title was copy-pasted from the severity heatmap cell.
    title='<b>Traffic Volume in London</b>',
)
# Update layout properties
fig.update_layout(
    font=dict(family="Product Sans, sans-serif"),
    margin=dict(l=0, r=0, t=50, b=0),
)
fig.update_layout(
    showlegend=False,
    mapbox=dict(
        bearing=0,
        pitch=0,
        zoom=zoom_level,
        center=dict(lat=51.5074, lon=-0.1278),
        layers=[],  # Update this accordingly if you want to add more layers.
    ),
)
fig.update_traces(opacity=0.6)
fig.write_image("/Users/behemoth/Desktop/visualisation_output.svg")
fig.show()
The heatmap visualization showed high correlation between traffic and accidents in Central London as expected, with the busiest areas also seeing more accidents. However, East London emerged as an anomaly, with high traffic but disproportionately higher accidents not fully explained by volume alone.
With fewer granular accident data points compared to traffic, we have to be cautious about overinterpreting spatial patterns. The clustering in Central London likely reflects general correlation of accidents with traffic volume, but East London indicates other factors at play as well.
In this case due to lack of traffic data we'll just note that to further improve our model, and assess properly the potential for higher accident rates at specific locations we'll need to collect more data on traffic in London. This could allow us to calculate accident rates normalized by traffic to better identify locations with higher risk. Along with spatial analysis techniques, we can then further analyze the complex interplay of road infrastructure, usage patterns and safety outcomes.
We leveraged osmnx to map the London road network and visualize hotspots based on accident counts. The network graph representation enables optimized safety routing to avoid high-risk areas in the future. Beyond visualizing past data, the versatile network allows us to simulate changes to model the impact on safety scores for different routing approaches when we'll run further analysis on the model in the future.
# Build the drivable road network for Greater London plus the City of London
# (osmnx treats them as separate places) and merge the two graphs.
G1 = ox.graph_from_place("London, England", network_type="drive")
G2 = ox.graph_from_place("City of London, England", network_type="drive")
G = nx.compose(G1, G2)
gdf_nodes, gdf_edges = ox.utils_graph.graph_to_gdfs(G)

# Accident points as a GeoDataFrame in WGS84 lon/lat.
gdf_london = gpd.GeoDataFrame(
    london_data,
    geometry=gpd.points_from_xy(london_data["Longitude"], london_data["Latitude"]),
    crs="EPSG:4326"
)

# Bounding box of all accident coordinates.
north = london_data['Latitude'].max()
south = london_data['Latitude'].min()
east = london_data['Longitude'].max()
west = london_data['Longitude'].min()

# Create a GeoDataFrame with the bounding box polygon.
# BUG FIX: declare the polygon's CRS — without it the sjoin below emitted a
# "CRS mismatch between left and right geometries" warning and compared
# geometries with an undefined reference system.
geometry = box(west, south, east, north)
gdf_poly = gpd.GeoDataFrame({'geometry': [geometry]}, crs="EPSG:4326")

# reduce accidents down to those in investigated location
gdf_loc = gdf_london.sjoin(gdf_poly)

# Project the graph, and the points into the same CRS, so nearest-edge
# distances are computed in metres rather than degrees.
G_proj = ox.project_graph(G)
gdf_loc_p = gdf_loc["geometry"].to_crs(G_proj.graph["crs"])
/Users/behemoth/anaconda3/lib/python3.11/site-packages/geopandas/geodataframe.py:2187: UserWarning: CRS mismatch between the CRS of left geometries and the CRS of right geometries. Use `to_crs()` to reproject one of the input geometries to match the CRS of the other. Left CRS: EPSG:4326 Right CRS: None
# Snap every accident point to its nearest road edge in the projected graph.
# ne is a list of (u, v, key) edge identifiers; d holds each point's distance
# to its matched edge (in the projected CRS units).
ne, d = ox.nearest_edges(
    G_proj, X=gdf_loc_p.x.values, Y=gdf_loc_p.y.values, return_dist=True
)
# reindex points based on results from nearest_edges: index the accident
# points by their matched edge so they can be joined against gdf_edges,
# which uses the same (u, v, key) MultiIndex.
gdf_loc = (
    gdf_loc.set_index(pd.MultiIndex.from_tuples(ne, names=["u", "v", "key"]))
    .assign(distance=d)
    .sort_index()
)
gdf_loc.head()
| Accident_Index | Accident Date | Day_of_Week | Junction_Control | Junction_Detail | Accident_Severity | Latitude | Light_Conditions | Local_Authority_(District) | Longitude | ... | Road_Type | Speed_limit | Time | Urban_or_Rural_Area | Weather_Conditions | Vehicle_Type | Severity_Weight | geometry | index_right | distance | |||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| u | v | key | |||||||||||||||||||||
| 99936 | 4544836433 | 0 | 200901CW10175 | 1/1/2021 | Thursday | Auto traffic signal | Crossroads | Serious | 51.523932 | Darkness - lights lit | Westminster | -0.153030 | ... | Single carriageway | 30 | 17:30 | Urban | Fine no high winds | Motorcycle over 500cc | NaN | POINT (-0.15303 51.52393) | 0 | 6.612032 |
| 0 | 200901CW10476 | 3/20/2021 | Friday | Data missing or out of range | Not at junction or within 20 metres | Slight | 51.523660 | Darkness - lights lit | Westminster | -0.152897 | ... | Dual carriageway | 30 | 23:30 | Urban | Fine no high winds | Car | NaN | POINT (-0.15290 51.52366) | 0 | 5.676939 | ||
| 0 | 200901CW11120 | 5/20/2021 | Wednesday | Auto traffic signal | Crossroads | Slight | 51.523842 | Daylight | Westminster | -0.153033 | ... | Single carriageway | 30 | 18:00 | Urban | Fine no high winds | Car | NaN | POINT (-0.15303 51.52384) | 0 | 9.450222 | ||
| 0 | 200901CW11136 | 6/12/2021 | Friday | Auto traffic signal | Crossroads | Slight | 51.523930 | Daylight | Westminster | -0.152886 | ... | Single carriageway | 30 | 12:15 | Urban | Fine no high winds | Motorcycle over 500cc | NaN | POINT (-0.15289 51.52393) | 0 | 2.971498 | ||
| 0 | 201001CW10726 | 4/21/2022 | Wednesday | Auto traffic signal | Crossroads | Slight | 51.523932 | Daylight | Westminster | -0.153030 | ... | Single carriageway | 30 | 18:19 | Urban | Fine no high winds | Bus or coach (17 or more pass seats) | NaN | POINT (-0.15303 51.52393) | 0 | 6.612032 |
5 rows × 24 columns
In this part we're joining our geometry df to generate a summary DataFrame that groups the number of accidents in the correct location of the network.
# Join the snapped accident points onto the road edges and aggregate per edge:
# 'number' is the accident count, 'geometry' the edge geometry.
gdf_joined = (
    gdf_edges.join(gdf_loc, rsuffix="_loc", how="inner")
    .groupby(["u", "v", "key"])
    .agg(geometry=("geometry", "first"), number=("osmid", "size"))
)
# Calculate accident rate per unit length.
# NOTE(review): these geometries are in EPSG:4326, so .length is measured in
# degrees, not metres — consider projecting before computing rates.
gdf_joined["rate"] = gdf_joined["number"] / gdf_joined["geometry"].length
# Ensure the active geometry column is set to 'geometry'
gdf_joined = gdf_joined.set_geometry('geometry')
# Distribution of per-edge accident counts (how many edges saw N accidents).
summary_df = gdf_joined.groupby("number").size().reset_index(name="count")
/Users/behemoth/anaconda3/lib/python3.11/site-packages/pandas/core/ops/array_ops.py:82: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
Binning the number of accidents into categories based on the updated criteria:
We also added an additional normalisation by log scale which we can visualise when hovering over the data points on our map.
# Bin edges for per-segment accident counts:
# (0,1] = Low, (1,3] = Moderate, (3,8] = High, (8,inf) = Very High.
bins = [0, 1, 3, 8, float("inf")]
bin_labels = ["Low", "Moderate", "High", "Very High"]

# Categorise each road segment by how many accidents were snapped to it.
gdf_joined["Accident_count_binned"] = pd.cut(
    gdf_joined["number"], bins=bins, labels=bin_labels
)

# Log scale compresses the heavily skewed per-length rate so the values shown
# on map hover span a readable range.
gdf_joined["normalized_rate"] = np.log1p(gdf_joined["rate"])

# Sanity-check the distribution of the normalised accident rate.
gdf_joined["normalized_rate"].describe()
top_10_roads = gdf_joined.nlargest(10, 'number')
m = folium.Map(tiles=None, location=[51.5074, -0.1278], zoom_start = 12.5)
folium.TileLayer(
name='Mapbox',
location=[51.5074, -0.1278],
zoom_start=12,
tiles='https://api.mapbox.com/styles/v1/laurazecca/clsditq1f01rx01qy67pm2040/tiles/256/{z}/{x}/{y}@2x?access_token=pk.eyJ1IjoibGF1cmF6ZWNjYSIsImEiOiJjbHMyd3gzZncwbnphMnBtZTJkYjhkMmlxIn0.WxR8PVytlUu4ksVRqySjrQ',
attr='mapbox',
show=True).add_to(m)
# Layer group so the top-10 markers can be toggled in the LayerControl.
marker_layer = folium.FeatureGroup(name="Accident Markers")
# One warning marker at the centroid of each of the ten worst road segments;
# the popup shows the raw accident count.
for idx, road in top_10_roads.iterrows():
    folium.Marker(
        location=[road.geometry.centroid.y, road.geometry.centroid.x],
        popup=str(road['number']),
        icon=folium.Icon(color='orange', icon= 'exclamation-triangle', prefix='fa')
    ).add_to(marker_layer)
# Add base map layer
#base_map_layer = folium.FeatureGroup(name="")
marker_layer.add_to(m)
# Draw the binned road geometries on the same folium map, coloured by the
# Low/Moderate/High/Very High category computed above.
m = gdf_joined.explore(
    m=m, column="Accident_count_binned", cmap="plasma", name="Accident roads",
    weight = 6,
    opacity= 0.5)
# One (lat, lon) pair per geometry for the heat map.
# NOTE(review): `.xy[...][0]` takes only the FIRST coordinate of each
# geometry — for line geometries that is the start vertex, not a centroid;
# confirm this sampling is intended.
heatmap_data = [[point.xy[1][0], point.xy[0][0]] for point in gdf_joined.geometry ]
heat_maplayer = folium.FeatureGroup(name="Heat Map")
# Create and add a HeatMap layer
heat_map = HeatMap(
    heatmap_data,
    min_opacity=0.1,
    max_opacity=0.6, # Adjust max opacity for overall intensity
    radius=10,
    blur=15,
    gradient={0.4: 'blue', 0.65: 'lime', 1: 'red'}
).add_to(heat_maplayer)
heat_maplayer.add_to(m)
# Layer switcher (markers / roads / heat map), then display the map inline.
folium.LayerControl(collapsed=False).add_to(m)
m
# Convert 'Accident Date' to datetime format
london_data['Accident Date'] = pd.to_datetime(london_data['Accident Date'])
# Convert 'Time' to datetime format and extract hour and minute
london_data['Time'] = pd.to_datetime(london_data['Time'])
# Extract additional time-related features.
# BUG FIX: the original line here was the bare expression
# `london_data['Day_of_Week']` — a no-op that assigned nothing. Derive the
# weekday name explicitly so the column the later grouping cells rely on
# ('Monday'..'Sunday' strings) is guaranteed to exist.
london_data['Day_of_Week'] = london_data['Accident Date'].dt.day_name()
london_data['Month'] = london_data['Accident Date'].dt.month
london_data['Year'] = london_data['Accident Date'].dt.year
london_data['Hour'] = london_data['Time'].dt.hour
# warning messages are fine for our purpose
/var/folders/zp/8608g3pd2td58x042d94m2sw0000gn/T/ipykernel_20671/4005492272.py:5: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
# Per-year slices of the London subset, used by the charts below.
data_2021 = london_data.loc[london_data['Year'] == 2021]
data_2022 = london_data.loc[london_data['Year'] == 2022]
The number of accidents rises steadily from the winter months into summer, and the monthly total reaches its peak in October.
# using altair
# Count accidents per (month, year) pair.
monthly_data = london_data.groupby(['Month', 'Year']).size().reset_index(name='Count')
# Cross-year mean count per calendar month.
average_accidents = monthly_data.groupby('Month')['Count'].mean().reset_index(name='Average')
# np.round is applied to the whole frame; 'Month' is an integer key so only
# the 'Average' column is effectively rounded.
monthly_data = pd.merge(monthly_data, np.round(average_accidents), on='Month', how='left')
# Stacked bar chart: one bar per month, stacked by year, with the cross-year
# average surfaced in the tooltip.
chart = alt.Chart(monthly_data).mark_bar().encode(
    x=alt.X('Month:O', title='Month', axis=alt.Axis(labelAngle=0,labelFontSize=14,titleFontSize=16, titlePadding = 20)),
    y=alt.Y('sum(Count):Q', title='Number of Accidents', axis=alt.Axis(labelFontSize=14,titleFontSize=16, titlePadding = 20)),
    color=alt.Color('Year:N', title='Year', scale=alt.Scale(range=['#4285F4', '#34A853'])),
    tooltip=['Year:O',
             'Month:O',
             alt.Tooltip('sum(Count)', title='Number of Accidents'),
             alt.Tooltip('Average', title='Average Number of Accidents')]
).properties(
    width=600,
    height=400,
    title=alt.TitleParams(text='Number of Accidents by Month for 2021 and 2022', fontSize=20, font='Product Sans')
).configure_legend(
    titleFontSize=18,
    labelFontSize=16
)
chart
Accidents increase steadily from Monday to Friday, peaking on Fridays.
# Count accidents per (weekday, year) pair.
daily_data = london_data.groupby(['Day_of_Week', 'Year']).size().reset_index(name='Count')
# Cross-year mean count per weekday (np.round rounds the 'Average' column;
# the 'Day_of_Week' key is a string and passes through the merge untouched).
average_accidents = daily_data.groupby('Day_of_Week')['Count'].mean().reset_index(name='Average')
daily_data = pd.merge(daily_data, np.round(average_accidents), on='Day_of_Week', how='left')
# Monday-first ordering for the x-axis; `day_order` is reused by later cells.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# Stacked bar chart: one bar per weekday, stacked by year.
chart = alt.Chart(daily_data).mark_bar().encode(
    x=alt.X('Day_of_Week:N', title='Day of Week', sort=day_order,axis=alt.Axis(labelAngle=0,labelFontSize=14,titleFontSize=16, titlePadding = 20)),
    y=alt.Y('sum(Count):Q', title='Number of Accidents',axis=alt.Axis(labelFontSize=14,titleFontSize=16, titlePadding = 20)), # Sum of counts for each day
    color=alt.Color('Year:N', title='Year',scale=alt.Scale(range=['#4285F4', '#34A853'])),
    tooltip=['Year:O',
             alt.Tooltip('Day_of_Week:N', title = 'Day of Week'),
             alt.Tooltip('sum(Count)', title='Number of Accidents'),
             alt.Tooltip('Average', title='Average Number of Accidents')]
).properties(
    width=600,
    height=400,
    title=alt.TitleParams(text='Number of Accidents by Day of Week for 2021 and 2022', font='Product Sans', fontSize=20)
).configure_legend(
    titleFontSize = 18,
    labelFontSize = 16
)
chart
Analyzing accidents by time of day shows peak hours around 8am and 6pm, aligning with rush hour traffic. We binned times into 24 hourly periods to see this daily pattern. Friday and Saturday nights also see increased late night accidents, so just looking at weekday versus weekend totals can obscure risky nightlife hours.
# Count accidents per (hour of day, year) pair.
hourly_data = london_data.groupby(['Hour', 'Year']).size().reset_index(name='Count')
# Start the x-axis at 04:00 so the late-night hours (00-03) sit at the end,
# keeping an evening-into-night stretch contiguous on the chart.
hour_order = list(range(4, 24)) + list(range(0, 4))
# Stacked bar chart: one bar per hour, stacked by year.
chart = alt.Chart(hourly_data).mark_bar().encode(
    x=alt.X('Hour:O', title='Hour of the Day',sort=hour_order,axis=alt.Axis(labelAngle=0,labelFontSize=14,titleFontSize=16, titlePadding = 20)),
    y=alt.Y('sum(Count):Q', title='Number of Accidents',axis=alt.Axis(labelFontSize=14,titleFontSize=16, titlePadding = 20)),
    color=alt.Color('Year:N', title='Year',scale=alt.Scale(range=['#4285F4', '#34A853'])),
    tooltip=['Year:O','Hour:O', alt.Tooltip('sum(Count)', title='Number of Accidents')]
).properties(
    width=600,
    height=400,
    title=alt.TitleParams(text='Number of Accidents by Hour of the Day for 2021 and 2022', fontSize=20, font = 'Product Sans')
).configure_legend(
    titleFontSize = 18,
    labelFontSize = 16
)
chart
# Accident counts by month x weekday, averaged over the years present.
# FIX: the original divided by a hard-coded 2 (for 2021-2022); derive the
# divisor from the data so the cell stays correct if the date range changes.
pivot_table_heatmap = london_data.pivot_table(index='Month', columns='Day_of_Week', aggfunc='size', fill_value=0)
n_years = london_data['Year'].nunique()
pivot_table_heatmap = round(pivot_table_heatmap / n_years).astype(int)
# Reorder the columns by the logical order of weekdays
pivot_table = pivot_table_heatmap[['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']]
# Plot the heatmap; [::-1] flips the rows so January ends up at the bottom.
plt.figure(figsize=(10, 6))
ax = sns.heatmap(pivot_table[::-1], cmap='Blues', annot=True, fmt='d', linewidths=.5)
# Label the colorbar.
colorbar = ax.collections[0].colorbar
colorbar.set_label('Number of Accidents', rotation=270, labelpad=15)
plt.title('Average Accident Count by Day of Week and Month')
plt.xlabel('Day of Week')
plt.ylabel('Month')
#plt.savefig('RQ2_4.svg', format='svg')
plt.show()
The dynamic heatmap visualizes accident frequency over hour, weekday, and month. Darker cells show more frequent historical crashes. We can incorporate these insights into the algorithm calculating user alert types and timing to promote safety awareness.
# Build one month x weekday count table per hour of day.
hourly_pivot_tables = {}
for hour in london_data['Hour'].unique():
    filtered_data = london_data[london_data['Hour'] == hour]
    pivot_table = filtered_data.pivot_table(index='Month', columns='Day_of_Week',
                                            values='Accident_Index', aggfunc='count', fill_value=0)
    pivot_table = pivot_table[day_order]
    hourly_pivot_tables[hour] = pivot_table
# Order the tables by hour so the slider steps run 0..23.
sorted_hourly_pivot_tables = {h: hourly_pivot_tables[h] for h in sorted(hourly_pivot_tables)}
# Create the base figure for the subplot
fig = make_subplots(rows=1, cols=1)
# Shared colour range across every hour so the slider frames are comparable.
zmin = min(pivot.min().min() for pivot in sorted_hourly_pivot_tables.values())
zmax = max(pivot.max().max() for pivot in sorted_hourly_pivot_tables.values())
# Add a heatmap for each hour; all start hidden except the first.
for hour, pivot_table in sorted_hourly_pivot_tables.items():
    # BUG FIX: the original labelled the y-axis with ALL twelve month names
    # in reverse order while the z rows stayed in ascending month order, so
    # rows were mislabelled (and outright wrong for any hour missing a
    # month). Reverse the data rows and derive the labels from the pivot's
    # actual index instead.
    pivot_desc = pivot_table.iloc[::-1]
    month_labels = [calendar.month_name[m] for m in pivot_desc.index]
    fig.add_trace(
        go.Heatmap(
            z=pivot_desc.values,
            x=pivot_desc.columns,
            y=month_labels,
            colorscale='Blues',
            colorbar=dict(title='Average Number of Accidents'),
            showscale=True,
            name=f'Hour {hour}',
            zmin=zmin,  # shared minimum for the colour scale
            zmax=zmax,  # shared maximum for the colour scale
            # FIX: hover label previously read "Week of Day".
            hovertemplate='Day of Week: %{x}<br>Month: %{y}<br>Average Accident Count: %{z}<extra></extra>'
        ),
        row=1, col=1
    )
    fig.data[-1].visible = False
# Make the first hour visible
fig.data[0].visible = True
# Hour slider: each step reveals exactly one hour's trace.
steps = []
for i, hour in enumerate(sorted_hourly_pivot_tables.keys()):
    step = dict(
        method='update',
        args=[{'visible': [False] * len(fig.data)},
              {'title': f'Accidents by Hour, Day of Week, and Month: Hour {hour}'}],
        label=f'{hour}'
    )
    step['args'][0]['visible'][i] = True
    steps.append(step)
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Hour: "},
    pad={"t": 20},
    steps=steps
)]
fig.update_layout(
    sliders=sliders,
    title='Dynamic Heatmap of Average Accident Count by Hour, Day of Week, and Month',
    xaxis_title='Day of Week',
    yaxis_title='Month',
    height=700,
    width=900,
    xaxis_side='top',
    margin=dict(t=170),
    font=dict(family='Product Sans')
)
fig.show()
The heatmap shows weekday rush hours and weekdays overall have significantly higher accidents. Based on these historical patterns, our alerts notify users of high risk conditions to promote caution and confidence in navigation. We aim to distill data insights into clear safety awareness.
# Peak hours: morning 07-10 and evening 16-19 inclusive.
peak_hours = list(range(7, 11)) + list(range(16, 20))
# Categorize the data into weekday/weekend and peak/non-peak time
london_data['Weekday_Weekend'] = np.where(london_data['Day_of_Week'].isin(['Saturday', 'Sunday']), 'Weekend', 'Weekday')
london_data['Peak_NonPeak'] = np.where(london_data['Hour'].isin(peak_hours), 'Peak', 'Non-Peak')
# 2x2 count table, in a fixed row/column order for the plot below.
pivot_table_weekday_peak = pd.pivot_table(london_data, index='Weekday_Weekend', columns='Peak_NonPeak', aggfunc='size', fill_value=0)
pivot_table_weekday_peak = pivot_table_weekday_peak.loc[['Weekday', 'Weekend'], ['Peak', 'Non-Peak']]
# FIX: annual average via the number of years actually present, instead of
# the hard-coded // 2 that assumed exactly 2021-2022.
pivot_table_weekday_peak = pivot_table_weekday_peak // london_data['Year'].nunique()
# Long-form version of the table (kept for plotly workflows).
df_for_plotly = pivot_table_weekday_peak.reset_index().melt(id_vars='Weekday_Weekend', var_name='Peak Time', value_name='Average Accidents')
# Use Plotly Express to create the 2x2 heatmap.
fig = px.imshow(pivot_table_weekday_peak,
                labels=dict(x="Peak Time", y="Day Type", color="Average Number of Accidents"),
                x=['Peak', 'Non-Peak'],
                y=['Weekday', 'Weekend'],
                text_auto=True,  # annotate each cell with its value
                color_continuous_scale='Blues')  # Blue color scale for the heatmap
# Titles, sizing, and backgrounds.
# FIX: plot_bgcolor was 'blue', contradicting its own "white background"
# comment and clashing with the Blues colour scale; the stated intent was white.
fig.update_layout(title='<b>Average Number of Accidents by Day Type and Peak Time</b>',
                  xaxis_title='<b>Peak Time</b>',
                  yaxis_title='<b>Day Type</b>',
                  width = 900,
                  height = 600,
                  plot_bgcolor='white',  # white background for the plot area
                  paper_bgcolor='white')  # white background for the paper
# Reverse the y-axis to match the earlier seaborn orientation.
fig.update_yaxes(autorange="reversed", tickangle=0)
fig.update_xaxes(title_text='Peak Time')  # Update x-axis label
fig.update_yaxes(title_text='Day Type')  # Update y-axis label
fig.write_image("/Users/behemoth/Downloads/main_matrix.svg", engine="kaleido")  # Export to SVG via Kaleido
fig.show()
We initially plotted Junction Control, Weather Conditions and Speed Limit together across several heatmaps. The initial insight is that most accidents occurred at a speed limit of 30 on days with fine weather and no high winds. This brings to light a counterintuitive finding: accidents tend to occur when people overlook their external surroundings while driving.
import matplotlib.colors as mcolors
# Mapping accident severity to numerical weights
severity_weights = {'Fatal': 3, 'Serious': 2, 'Slight': 1}
london_data['Severity_Weight'] = london_data['Accident_Severity'].map(severity_weights)
# Weighted incident count: casualties scaled by severity.
london_data['Weighted_Accidents'] = london_data['Number_of_Casualties'] * london_data['Severity_Weight']
# One heatmap (weather x speed limit) per junction-control type.
junction_controls = london_data['Junction_Control'].unique()
weather_conditions = london_data['Weather_Conditions'].unique()
speed_limits = list(range(0, 71, 10))  # 0-70 mph in steps of 10
fig, axs = plt.subplots(nrows=len(junction_controls), ncols=1, figsize=(15, 20))
# With a single junction type, plt.subplots returns a bare Axes, not an array.
if len(junction_controls) == 1:
    axs = [axs]
# Iterate through each Junction_Control
for i, junction in enumerate(junction_controls):
    subset = london_data[london_data['Junction_Control'] == junction]
    pivot = subset.pivot_table(index='Weather_Conditions',
                               columns='Speed_limit',
                               values='Weighted_Accidents',
                               aggfunc='sum',
                               fill_value=0)
    # Guarantee a column for every speed limit, even if absent in the subset.
    pivot = pivot.reindex(columns=speed_limits, fill_value=0)
    # FIX: removed dead code that built an unused custom `light_cmap` every
    # iteration — the heatmap has always been drawn with 'Blues'.
    sns.heatmap(pivot, ax=axs[i], cmap='Blues', annot=True, fmt=".0f")
    axs[i].set_title(f'Junction Control: {junction}')
    axs[i].set_ylabel('Weather Conditions')
    axs[i].set_xlabel('Speed Limit')
# FIX: removed plt.style.use('seaborn-white') — it ran AFTER plotting (no
# effect on this figure) and the style name is deprecated/removed in
# matplotlib >= 3.6/3.8 (see the captured warning below).
plt.tight_layout()
plt.show()
/var/folders/zp/8608g3pd2td58x042d94m2sw0000gn/T/ipykernel_20671/2650802015.py:45: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
data = london_data
all_data = df
# Mapping accident severity to numerical weights
severity_weights = {'Fatal': 3, 'Serious': 2, 'Slight': 1}
all_data['Severity_Weight'] = all_data['Accident_Severity'].map(severity_weights)
# Weighted incident count: casualties scaled by severity.
all_data['Weighted_Accidents'] = all_data['Number_of_Casualties'] * all_data['Severity_Weight']
# One heatmap (road surface x speed limit) per urban/rural area — note this
# cell uses the FULL dataset (df), not the London subset.
urban_rural_areas = all_data['Urban_or_Rural_Area'].unique()
speed_limits = list(range(0, 71, 10))  # 0-70 mph in steps of 10
road_conditions = all_data['Road_Surface_Conditions'].unique()
fig, axs = plt.subplots(nrows=len(urban_rural_areas), ncols=1, figsize=(15, 10))
# With a single area value, plt.subplots returns a bare Axes, not an array.
if len(urban_rural_areas) == 1:
    axs = [axs]
# Iterate through each Urban_or_Rural_Area
for i, area in enumerate(urban_rural_areas):
    subset = all_data[all_data['Urban_or_Rural_Area'] == area]
    pivot = subset.pivot_table(index='Road_Surface_Conditions',
                               columns='Speed_limit',
                               values='Weighted_Accidents',
                               aggfunc='sum',
                               fill_value=0)
    # Guarantee a column for every speed limit, even if absent in the subset.
    pivot = pivot.reindex(columns=speed_limits, fill_value=0)
    # FIX: removed dead code that built an unused custom `light_cmap` every
    # iteration — the heatmap has always been drawn with 'Blues'.
    sns.heatmap(pivot, ax=axs[i], cmap='Blues', annot=True, fmt=".0f")
    axs[i].set_title(f'Urban/Rural Area: {area}')
    axs[i].set_ylabel('Road Surface Conditions')
    axs[i].set_xlabel('Speed Limit')
# FIX: removed the post-plot, deprecated plt.style.use('seaborn-white') call
# (see the captured MatplotlibDeprecationWarning below).
plt.tight_layout()
plt.show()
/var/folders/zp/8608g3pd2td58x042d94m2sw0000gn/T/ipykernel_20671/931135046.py:45: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
Moving towards developing a concise key static figure for this question, we normalise the accident count by weights of severity.
# Weight each accident so serious and fatal crashes count proportionally
# more (Fatal x3, Serious x2, Slight x1), scaled by casualty count.
severity_weights = {'Fatal': 3, 'Serious': 2, 'Slight': 1}
london_data['Severity_Weight'] = london_data['Accident_Severity'].map(severity_weights)
london_data['Weighted_Accidents'] = london_data['Number_of_Casualties'] * london_data['Severity_Weight']
# Classify weather conditions without considering the wind
def classify_weather(condition):
    """Collapse a raw Weather_Conditions value into four coarse buckets.

    'Fine', 'Raining' and 'Snowing' match regardless of wind qualifiers
    (e.g. 'Fine + high winds' -> 'Fine'); everything else — fog/mist,
    missing values, unrecognised text — maps to 'Other'.
    """
    condition = str(condition)  # tolerate NaN/None without raising
    for label in ('Fine', 'Raining', 'Snowing'):
        if label in condition:
            return label
    # The original had an explicit `elif` for fog/null that returned the
    # same value as the final else; both branches collapse to 'Other'.
    return 'Other'
# Apply the classification
london_data['Weather_Category'] = london_data['Weather_Conditions'].apply(classify_weather)
# Total weighted accidents per (junction control, weather category) pair.
weather_grouped = london_data.groupby(['Junction_Control', 'Weather_Category'])['Weighted_Accidents'].sum().reset_index()
# Normalise within each junction-control group so its shares sum to 1 —
# the heatmap compares the weather MIX per junction type, not absolute volume.
weather_grouped['Normalized_Weighted_Accidents'] = weather_grouped['Weighted_Accidents'] / weather_grouped.groupby('Junction_Control')['Weighted_Accidents'].transform('sum')
# Use heat maps to present data
heatmap = alt.Chart(weather_grouped, title="Heatmap of Weighted Accidents by Junction_Control and Weather Categories").mark_rect().encode(
    alt.X('Junction_Control:O', title="Junction Control"),
    alt.Y('Weather_Category:O', title="Weather Category"),
    alt.Color('Normalized_Weighted_Accidents:Q', title="Normalized Weighted Accidents", scale=alt.Scale(scheme='blues'))
).properties(
    width=600,
    height=400
).configure_view(
    strokeWidth=0
)
heatmap
This code processes London road accident data by first filtering accidents based on geographical coordinates to focus on London. It assigns weights to accidents based on their severity—fatal, serious, and slight—to calculate a weighted number of incidents. The weather conditions are simplified into four categories—'Fine', 'Raining', 'Snowing', and 'Other' (including fog/mist and missing data)—ignoring wind conditions. These steps allow for grouping and normalizing weighted accident data by junction control and weather categories. The final output is a heatmap visualization that provides an intuitive comparison of accident severity across different junction controls and weather conditions, enabling a clearer understanding of how these factors correlate with road safety in London.
# Apply the weather classification (same coarse buckets as the cell above).
london_data['Weather_Category'] = london_data['Weather_Conditions'].apply(classify_weather)
# Total weighted accidents per (speed limit, weather category) pair.
grouped_data = london_data.groupby(['Speed_limit', 'Weather_Category'])['Weighted_Accidents'].sum().reset_index()
# Normalise within each speed limit so each heatmap column sums to 1.
grouped_data['Normalized_Weighted_Accidents'] = grouped_data['Weighted_Accidents'] / grouped_data.groupby('Speed_limit')['Weighted_Accidents'].transform('sum')
# Use heat maps to present data
heatmap = alt.Chart(grouped_data, title="Heatmap of Weighted Accidents by Speed Limit and Weather Categories").mark_rect().encode(
    alt.X('Speed_limit:O', title="Speed Limit"),
    alt.Y('Weather_Category:O', title="Weather Category"),
    alt.Color('Normalized_Weighted_Accidents:Q', title="Normalized Weighted Accidents", scale=alt.Scale(scheme='blues'))
).properties(
    width=600,
    height=400
).configure_view(
    strokeWidth=0
)
heatmap
From a visualization perspective, we have showcased the impact of location, time, and external conditions on traffic accidents. However, to harness these valuable features to develop a product that can improve people's lives, we need to employ more sophisticated models.
We have utilized an Attentional Neural Network (ANN) model to successfully predict the severity of traffic accidents—although currently, the model sometimes misclassifies less severe accidents as more serious, such a bias towards safety is not detrimental to ensuring people's safety. Given the close relationship between accidents and the driver's condition, we plan to incorporate machine vision models developed by Google's top engineers in future product upgrades to further refine the application's algorithm, taking into account the driver's state. Currently, our app can indeed effectively alert users and prevent serious accidents. This attempt marks Google's initial step in integrating AI technology into daily life, with a commitment to continuously use technology to benefit humanity.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Multiply
from tensorflow.keras.regularizers import l2
2024-02-14 20:25:59.042127: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
This code snippet outlines a data preprocessing function for a road accident dataset. It begins by loading the dataset from a specified filepath using pandas. It then converts the 'Accident Date' and 'Time' columns into datetime objects to extract relevant temporal features such as hour, month, year, and day of the week, which can be critical for accident analysis. The original 'Accident Date' and 'Time' columns are dropped post-extraction to streamline the dataset.
Following this, the code handles categorical variables by encoding them using LabelEncoder, which transforms string labels into numerical format, making them suitable for machine learning models. It also stores mappings of these encodings to interpret model predictions later easily. For numerical columns, the snippet addresses missing values by replacing them with the median of their respective columns, a common practice to maintain data integrity.
Finally, the preprocessed data is saved to a new CSV file, and the function returns the path to this file along with the mappings of the encoded categorical variables. This preprocessing step is essential for transforming raw data into a format that's ready for analysis and modeling, ensuring that the dataset is clean, comprehensive, and conducive to generating accurate insights.
import pandas as pd
from sklearn.preprocessing import LabelEncoder
def preprocess_data(filepath):
    """Load the accident CSV and return a model-ready frame plus label maps.

    Steps: parse 'Accident Date' and 'Time' into Hour/Month/Year/Day_of_Week
    features (then drop the raw columns), label-encode every object column
    except 'Accident_Index', and median-fill missing numeric values.

    Parameters
    ----------
    filepath : str
        Path to the raw 'Road Accident Data' CSV.

    Returns
    -------
    (pandas.DataFrame, dict)
        The preprocessed frame and, for each encoded column, a
        {code: original_label} mapping for decoding model output.
        NOTE: Day_of_Week here is numeric dayofweek (0=Monday..6=Sunday),
        unlike the weekday-name strings used in the exploration cells.
    """
    data = pd.read_csv(filepath)
    # Unparseable dates become NaT; derived numeric features then fall
    # through to the median fill below.
    data['Accident Date'] = pd.to_datetime(data['Accident Date'], errors='coerce')
    # FIX: parse the time ONCE with an explicit format and take the hour
    # directly. The original round-tripped through .dt.time and a second
    # to_datetime on strings, which triggered the "Could not infer format"
    # warning (captured below) and did redundant work.
    parsed_time = pd.to_datetime(data['Time'], format='%H:%M', errors='coerce')
    data['Hour'] = parsed_time.dt.hour
    data['Month'] = data['Accident Date'].dt.month
    data['Year'] = data['Accident Date'].dt.year
    data['Day_of_Week'] = data['Accident Date'].dt.dayofweek
    # Drop the raw date/time columns now that the features are extracted.
    data.drop(['Accident Date', 'Time'], axis=1, inplace=True)
    # Label-encode categorical columns, keeping code->label mappings so
    # predictions can be translated back. 'Accident_Index' stays untouched;
    # errors='ignore' keeps this robust if that column is ever absent.
    label_encoders = {}
    label_mappings = {}
    categorical_cols = data.select_dtypes(include=['object']).columns.drop('Accident_Index', errors='ignore')
    for col in categorical_cols:
        label_encoders[col] = LabelEncoder()
        data[col] = label_encoders[col].fit_transform(data[col])
        label_mappings[col] = dict(enumerate(label_encoders[col].classes_))
    # Median-fill missing numeric values. FIX: plain assignment instead of
    # the chained fillna(..., inplace=True), which is deprecated in recent
    # pandas and can silently fail to write through.
    for col in data.select_dtypes(include=['int64', 'float64']).columns:
        data[col] = data[col].fillna(data[col].median())
    return data, label_mappings
# Location of the raw dataset on disk.
file_path = '/Users/behemoth/Downloads/Road Accident Data.csv'
# Run the preprocessing pipeline defined above.
preprocessed_data, category_mappings = preprocess_data(file_path)
# Persist the model-ready frame next to the notebook for the training cells.
preprocessed_filepath = 'Preprocessed_Road_Accident_Data.csv'
preprocessed_data.to_csv(preprocessed_filepath, index=False)
# Inline display of the output path and the label mappings.
preprocessed_filepath, category_mappings
/var/folders/zp/8608g3pd2td58x042d94m2sw0000gn/T/ipykernel_20671/3528044443.py:11: UserWarning: Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.
('Preprocessed_Road_Accident_Data.csv',
{'Junction_Control': {0: 'Authorised person',
1: 'Auto traffic sigl',
2: 'Auto traffic signal',
3: 'Data missing or out of range',
4: 'Give way or uncontrolled',
5: 'Not at junction or within 20 metres',
6: 'Stop sign'},
'Junction_Detail': {0: 'Crossroads',
1: 'Mini-roundabout',
2: 'More than 4 arms (not roundabout)',
3: 'Not at junction or within 20 metres',
4: 'Other junction',
5: 'Private drive or entrance',
6: 'Roundabout',
7: 'Slip road',
8: 'T or staggered junction'},
'Accident_Severity': {0: 'Fatal', 1: 'Fetal', 2: 'Serious', 3: 'Slight'},
'Light_Conditions': {0: 'Darkness - lighting unknown',
1: 'Darkness - lights lit',
2: 'Darkness - lights unlit',
3: 'Darkness - no lighting',
4: 'Daylight'},
'Local_Authority_(District)': {0: 'Aberdeen City',
1: 'Aberdeenshire',
2: 'Adur',
3: 'Allerdale',
4: 'Alnwick',
5: 'Amber Valley',
6: 'Angus',
7: 'Argyll and Bute',
8: 'Arun',
9: 'Ashfield',
10: 'Ashford',
11: 'Aylesbury Vale',
12: 'Babergh',
13: 'Barking and Dagenham',
14: 'Barnet',
15: 'Barnsley',
16: 'Barrow-in-Furness',
17: 'Basildon',
18: 'Basingstoke and Deane',
19: 'Bassetlaw',
20: 'Bath and North East Somerset',
21: 'Bedford',
22: 'Berwick-upon-Tweed',
23: 'Bexley',
24: 'Birmingham',
25: 'Blaby',
26: 'Blackburn with Darwen',
27: 'Blackpool',
28: 'Blaenau Gwent',
29: 'Blaeu Gwent',
30: 'Blyth Valley',
31: 'Bolsover',
32: 'Bolton',
33: 'Boston',
34: 'Bournemouth',
35: 'Bracknell Forest',
36: 'Bradford',
37: 'Braintree',
38: 'Breckland',
39: 'Brent',
40: 'Brentwood',
41: 'Bridgend',
42: 'Bridgnorth',
43: 'Brighton and Hove',
44: 'Bristol, City of',
45: 'Broadland',
46: 'Bromley',
47: 'Bromsgrove',
48: 'Broxbourne',
49: 'Broxtowe',
50: 'Burnley',
51: 'Bury',
52: 'Caerphilly',
53: 'Calderdale',
54: 'Cambridge',
55: 'Camden',
56: 'Cannock Chase',
57: 'Canterbury',
58: 'Caradon',
59: 'Cardiff',
60: 'Carlisle',
61: 'Carmarthenshire',
62: 'Carrick',
63: 'Castle Morpeth',
64: 'Castle Point',
65: 'Central Bedfordshire',
66: 'Ceredigion',
67: 'Charnwood',
68: 'Chelmsford',
69: 'Cheltenham',
70: 'Cherwell',
71: 'Cheshire East',
72: 'Cheshire West and Chester',
73: 'Chester',
74: 'Chester-le-Street',
75: 'Chesterfield',
76: 'Chichester',
77: 'Chiltern',
78: 'Chorley',
79: 'Christchurch',
80: 'City of London',
81: 'Clackmannanshire',
82: 'Clackmannshire',
83: 'Colchester',
84: 'Congleton',
85: 'Conwy',
86: 'Copeland',
87: 'Corby',
88: 'Cornwall',
89: 'Cotswold',
90: 'County Durham',
91: 'Coventry',
92: 'Craven',
93: 'Crawley',
94: 'Crewe and Nantwich',
95: 'Crewe and ntwich',
96: 'Croydon',
97: 'Dacorum',
98: 'Darlington',
99: 'Dartford',
100: 'Daventry',
101: 'Denbighshire',
102: 'Derby',
103: 'Derbyshire Dales',
104: 'Derwentside',
105: 'Doncaster',
106: 'Dover',
107: 'Dudley',
108: 'Dumfries and Galloway',
109: 'Dundee City',
110: 'Durham',
111: 'Ealing',
112: 'Easington',
113: 'East Ayrshire',
114: 'East Cambridgeshire',
115: 'East Devon',
116: 'East Dorset',
117: 'East Dunbartonshire',
118: 'East Hampshire',
119: 'East Hertfordshire',
120: 'East Lindsey',
121: 'East Lothian',
122: 'East Northamptonshire',
123: 'East Renfrewshire',
124: 'East Riding of Yorkshire',
125: 'East Staffordshire',
126: 'Eastbourne',
127: 'Eastleigh',
128: 'Eden',
129: 'Edinburgh, City of',
130: 'Ellesmere Port and Neston',
131: 'Elmbridge',
132: 'Enfield',
133: 'Epping Forest',
134: 'Epsom and Ewell',
135: 'Erewash',
136: 'Exeter',
137: 'Falkirk',
138: 'Fareham',
139: 'Fenland',
140: 'Fife',
141: 'Flintshire',
142: 'Forest Heath',
143: 'Forest of Dean',
144: 'Fylde',
145: 'Gateshead',
146: 'Gedling',
147: 'Glasgow City',
148: 'Gloucester',
149: 'Gosport',
150: 'Gravesham',
151: 'Great Yarmouth',
152: 'Greenwich',
153: 'Guildford',
154: 'Gwynedd',
155: 'Hackney',
156: 'Halton',
157: 'Hambleton',
158: 'Hammersmith and Fulham',
159: 'Harborough',
160: 'Haringey',
161: 'Harlow',
162: 'Harrogate',
163: 'Harrow',
164: 'Hart',
165: 'Hartlepool',
166: 'Hastings',
167: 'Havant',
168: 'Havering',
169: 'Herefordshire, County of',
170: 'Hertsmere',
171: 'High Peak',
172: 'Highland',
173: 'Hillingdon',
174: 'Hinckley and Bosworth',
175: 'Horsham',
176: 'Hounslow',
177: 'Huntingdonshire',
178: 'Hyndburn',
179: 'Inverclyde',
180: 'Ipswich',
181: 'Isle of Anglesey',
182: 'Isle of Wight',
183: 'Islington',
184: 'Kennet',
185: 'Kensington and Chelsea',
186: 'Kerrier',
187: 'Kettering',
188: "King's Lynn and West Norfolk",
189: 'Kingston upon Hull, City of',
190: 'Kingston upon Thames',
191: 'Kirklees',
192: 'Knowsley',
193: 'Lambeth',
194: 'Lancaster',
195: 'Leeds',
196: 'Leicester',
197: 'Lewes',
198: 'Lewisham',
199: 'Lichfield',
200: 'Lincoln',
201: 'Liverpool',
202: 'London Airport (Heathrow)',
203: 'Luton',
204: 'Macclesfield',
205: 'Maidstone',
206: 'Maldon',
207: 'Malvern Hills',
208: 'Manchester',
209: 'Mansfield',
210: 'Medway',
211: 'Melton',
212: 'Mendip',
213: 'Merthyr Tydfil',
214: 'Merton',
215: 'Mid Bedfordshire',
216: 'Mid Devon',
217: 'Mid Suffolk',
218: 'Mid Sussex',
219: 'Middlesbrough',
220: 'Midlothian',
221: 'Milton Keynes',
222: 'Mole Valley',
223: 'Monmouthshire',
224: 'Moray',
225: 'Neath Port Talbot',
226: 'New Forest',
227: 'Newark and Sherwood',
228: 'Newcastle upon Tyne',
229: 'Newcastle-under-Lyme',
230: 'Newham',
231: 'Newport',
232: 'North Ayrshire',
233: 'North Cornwall',
234: 'North Devon',
235: 'North Dorset',
236: 'North East Derbyshire',
237: 'North East Lincolnshire',
238: 'North Hertfordshire',
239: 'North Kesteven',
240: 'North Lanarkshire',
241: 'North Larkshire',
242: 'North Lincolnshire',
243: 'North Norfolk',
244: 'North Shropshire',
245: 'North Somerset',
246: 'North Tyneside',
247: 'North Warwickshire',
248: 'North West Leicestershire',
249: 'North Wiltshire',
250: 'Northampton',
251: 'Northumberland',
252: 'Norwich',
253: 'Nottingham',
254: 'Nuneaton and Bedworth',
255: 'Oadby and Wigston',
256: 'Oldham',
257: 'Orkney Islands',
258: 'Oswestry',
259: 'Oxford',
260: 'Pembrokeshire',
261: 'Pendle',
262: 'Penwith',
263: 'Perth and Kinross',
264: 'Peterborough',
265: 'Plymouth',
266: 'Poole',
267: 'Portsmouth',
268: 'Powys',
269: 'Preston',
270: 'Purbeck',
271: 'Reading',
272: 'Redbridge',
273: 'Redcar and Cleveland',
274: 'Redditch',
275: 'Reigate and Banstead',
276: 'Renfrewshire',
277: 'Restormel',
278: 'Rhondda, Cynon, Taff',
279: 'Ribble Valley',
280: 'Richmond upon Thames',
281: 'Richmondshire',
282: 'Rochdale',
283: 'Rochford',
284: 'Rossendale',
285: 'Rother',
286: 'Rotherham',
287: 'Rugby',
288: 'Runnymede',
289: 'Rushcliffe',
290: 'Rushmoor',
291: 'Rutland',
292: 'Ryedale',
293: 'Salford',
294: 'Salisbury',
295: 'Sandwell',
296: 'Scarborough',
297: 'Scottish Borders',
298: 'Sedgefield',
299: 'Sedgemoor',
300: 'Sefton',
301: 'Selby',
302: 'Sevenoaks',
303: 'Sheffield',
304: 'Shepway',
305: 'Shetland Islands',
306: 'Shrewsbury and Atcham',
307: 'Shropshire',
308: 'Slough',
309: 'Solihull',
310: 'South Ayrshire',
311: 'South Bedfordshire',
312: 'South Bucks',
313: 'South Cambridgeshire',
314: 'South Derbyshire',
315: 'South Gloucestershire',
316: 'South Hams',
317: 'South Holland',
318: 'South Kesteven',
319: 'South Lakeland',
320: 'South Lanarkshire',
321: 'South Larkshire',
322: 'South Norfolk',
323: 'South Northamptonshire',
324: 'South Oxfordshire',
325: 'South Ribble',
326: 'South Shropshire',
327: 'South Somerset',
328: 'South Staffordshire',
329: 'South Tyneside',
330: 'Southampton',
331: 'Southend-on-Sea',
332: 'Southwark',
333: 'Spelthorne',
334: 'St. Albans',
335: 'St. Edmundsbury',
336: 'St. Helens',
337: 'Stafford',
338: 'Staffordshire Moorlands',
339: 'Stevege',
340: 'Stevenage',
341: 'Stirling',
342: 'Stockport',
343: 'Stockton-on-Tees',
344: 'Stoke-on-Trent',
345: 'Stratford-upon-Avon',
346: 'Stroud',
347: 'Suffolk Coastal',
348: 'Sunderland',
349: 'Surrey Heath',
350: 'Sutton',
351: 'Swale',
352: 'Swansea',
353: 'Swindon',
354: 'Tameside',
355: 'Tamworth',
356: 'Tandridge',
357: 'Taunton Deane',
358: 'Teesdale',
359: 'Teignbridge',
360: 'Telford and Wrekin',
361: 'Tendring',
362: 'Test Valley',
363: 'Tewkesbury',
364: 'Thanet',
365: 'The Vale of Glamorgan',
366: 'Three Rivers',
367: 'Thurrock',
368: 'Tonbridge and Malling',
369: 'Torbay',
370: 'Torfaen',
371: 'Torridge',
372: 'Tower Hamlets',
373: 'Trafford',
374: 'Tunbridge Wells',
375: 'Tynedale',
376: 'Uttlesford',
377: 'Vale Royal',
378: 'Vale of White Horse',
379: 'Wakefield',
380: 'Walsall',
381: 'Waltham Forest',
382: 'Wandsworth',
383: 'Wansbeck',
384: 'Warrington',
385: 'Warwick',
386: 'Watford',
387: 'Waveney',
388: 'Waverley',
389: 'Wealden',
390: 'Wear Valley',
391: 'Wellingborough',
392: 'Welwyn Hatfield',
393: 'West Berkshire',
394: 'West Devon',
395: 'West Dorset',
396: 'West Dunbartonshire',
397: 'West Lancashire',
398: 'West Lindsey',
399: 'West Lothian',
400: 'West Oxfordshire',
401: 'West Somerset',
402: 'West Wiltshire',
403: 'Western Isles',
404: 'Westminster',
405: 'Weymouth and Portland',
406: 'Wigan',
407: 'Wiltshire',
408: 'Winchester',
409: 'Windsor and Maidenhead',
410: 'Wirral',
411: 'Woking',
412: 'Wokingham',
413: 'Wolverhampton',
414: 'Worcester',
415: 'Worthing',
416: 'Wrexham',
417: 'Wychavon',
418: 'Wycombe',
419: 'Wyre',
420: 'Wyre Forest',
421: 'York'},
'Carriageway_Hazards': {0: 'Any animal in carriageway (except ridden horse)',
1: 'Other object on road',
2: 'Pedestrian in carriageway - not injured',
3: 'Previous accident',
4: 'Vehicle load on road',
5: nan},
'Police_Force': {0: 'Avon and Somerset',
1: 'Bedfordshire',
2: 'Cambridgeshire',
3: 'Central',
4: 'Cheshire',
5: 'City of London',
6: 'Cleveland',
7: 'Cumbria',
8: 'Derbyshire',
9: 'Devon and Cornwall',
10: 'Dorset',
11: 'Dumfries and Galloway',
12: 'Durham',
13: 'Dyfed-Powys',
14: 'Essex',
15: 'Fife',
16: 'Gloucestershire',
17: 'Grampian',
18: 'Greater Manchester',
19: 'Gwent',
20: 'Hampshire',
21: 'Hertfordshire',
22: 'Humberside',
23: 'Kent',
24: 'Lancashire',
25: 'Leicestershire',
26: 'Lincolnshire',
27: 'Lothian and Borders',
28: 'Merseyside',
29: 'Metropolitan Police',
30: 'Norfolk',
31: 'North Wales',
32: 'North Yorkshire',
33: 'Northamptonshire',
34: 'Northern',
35: 'Northumbria',
36: 'Nottinghamshire',
37: 'South Wales',
38: 'South Yorkshire',
39: 'Staffordshire',
40: 'Strathclyde',
41: 'Suffolk',
42: 'Surrey',
43: 'Sussex',
44: 'Tayside',
45: 'Thames Valley',
46: 'Warwickshire',
47: 'West Mercia',
48: 'West Midlands',
49: 'West Yorkshire',
50: 'Wiltshire'},
'Road_Surface_Conditions': {0: 'Dry',
1: 'Flood over 3cm. deep',
2: 'Frost or ice',
3: 'Snow',
4: 'Wet or damp',
5: nan},
'Road_Type': {0: 'Dual carriageway',
1: 'One way street',
2: 'Roundabout',
3: 'Single carriageway',
4: 'Slip road',
5: nan},
'Urban_or_Rural_Area': {0: 'Rural', 1: 'Urban'},
'Weather_Conditions': {0: 'Fine + high winds',
1: 'Fine no high winds',
2: 'Fog or mist',
3: 'Other',
4: 'Raining + high winds',
5: 'Raining no high winds',
6: 'Snowing + high winds',
7: 'Snowing no high winds',
8: nan},
'Vehicle_Type': {0: 'Agricultural vehicle',
1: 'Bus or coach (17 or more pass seats)',
2: 'Car',
3: 'Goods 7.5 tonnes mgw and over',
4: 'Goods over 3.5t. and under 7.5t',
5: 'Minibus (8 - 16 passenger seats)',
6: 'Motorcycle 125cc and under',
7: 'Motorcycle 50cc and under',
8: 'Motorcycle over 125cc and up to 500cc',
9: 'Motorcycle over 500cc',
10: 'Other vehicle',
11: 'Pedal cycle',
12: 'Ridden horse',
13: 'Taxi/Private hire car',
14: 'Van / Goods 3.5 tonnes mgw or under'}})
This code snippet evaluates our ANN model's performance on classifying road accident severity. It starts by converting the model's probabilistic predictions into binary labels using a 0.5 threshold, distinguishing between 'Light' and 'Severe' accidents. A confusion matrix is then generated and visualized as a heatmap with Seaborn, clearly marking the counts of true positives, true negatives, false positives, and false negatives with appropriate labels for easy interpretation.
Subsequently, the snippet calculates sensitivity (recall) and specificity: sensitivity is the proportion of severe accidents the model correctly identifies, while specificity is the proportion of light accidents it correctly rules out. These metrics are printed out, providing a quick insight into the model's diagnostic accuracy. This concise evaluation highlights the model's effectiveness in practical terms and guides further refinement.
# Load the preprocessed dataset and drop columns that should not feed the model
file_path = 'Preprocessed_Road_Accident_Data.csv'
columns_to_drop = ['Accident_Index', 'Number_of_Casualties', 'Number_of_Vehicles', 'Police_Force']
data = pd.read_csv(file_path).drop(columns=columns_to_drop)
# Binary target: severity code 3 maps to class 0 ('Light'), everything else to class 1 ('Severe')
X = data.drop('Accident_Severity', axis=1)
y = data['Accident_Severity'].apply(lambda severity: int(severity != 3))
# Hold out 20% for testing; stratify so both splits keep the original class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Standardise features; the scaler is fitted on the training split only to avoid leakage
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Rebalance the training set by oversampling the minority class with SMOTE
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_scaled, y_train)
# Class weights computed on the resampled (balanced) data; the severe class
# then gets a deliberate extra 1.9x push so the model favours catching it
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train_sm), y=y_train_sm)
class_weights_dict = {0: class_weights[0], 1: 1.9 * class_weights[1]}
def attention_layer(inputs, name='attention_vec'):
    """Feature-wise soft attention gate.

    A softmax-activated dense layer learns one weight per input feature;
    the inputs are then scaled element-wise by those weights and returned.
    """
    n_features = int(inputs.shape[1])
    # One score per feature, normalised across features by the softmax.
    scores = Dense(n_features, activation='softmax', name=name)(inputs)
    # Apply the attention weights via an element-wise product.
    return Multiply()([inputs, scores])
# Build the ANN with the Keras Functional API: an attention gate followed by
# three regularised blocks (Dense -> BatchNorm -> Dropout) and a sigmoid head.
n_inputs = X_train_sm.shape[1]
input_layer = Input(shape=(n_inputs,))
h = attention_layer(input_layer)
for units in (128, 256, 256):
    h = Dense(units, activation='relu', kernel_regularizer=l2(0.01))(h)
    h = BatchNormalization()(h)
    h = Dropout(0.5)(h)
output_layer = Dense(1, activation='sigmoid')(h)
model = Model(inputs=[input_layer], outputs=[output_layer])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train with the biased class weights; 20% of the resampled training data is
# held back each epoch for validation.
model.fit(X_train_sm, y_train_sm, epochs=10, batch_size=64, validation_split=0.2, class_weight=class_weights_dict)
# Predict probabilities on the held-out test set and threshold them at 0.5
y_pred = model.predict(X_test_scaled).flatten()
y_pred_classes = np.where(y_pred > 0.5, 1, 0)
# Headline classification metrics on the test split
accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes)
recall = recall_score(y_test, y_pred_classes)
f1 = f1_score(y_test, y_pred_classes)
# Report each metric on its own line
for label, value in (('Accuracy', accuracy), ('Precision', precision),
                     ('Recall', recall), ('F1 Score', f1)):
    print(label + ':', value)
Epoch 1/10 5266/5266 [==============================] - 55s 10ms/step - loss: 1.2709 - accuracy: 0.4873 - val_loss: 0.6448 - val_accuracy: 0.8299 Epoch 2/10 5266/5266 [==============================] - 58s 11ms/step - loss: 0.9241 - accuracy: 0.5163 - val_loss: 0.5938 - val_accuracy: 0.7940 Epoch 3/10 5266/5266 [==============================] - 51s 10ms/step - loss: 0.9171 - accuracy: 0.5279 - val_loss: 0.5915 - val_accuracy: 0.6787 Epoch 4/10 5266/5266 [==============================] - 45s 9ms/step - loss: 0.9141 - accuracy: 0.5346 - val_loss: 0.5986 - val_accuracy: 0.7169 Epoch 5/10 5266/5266 [==============================] - 47s 9ms/step - loss: 0.9120 - accuracy: 0.5362 - val_loss: 0.6170 - val_accuracy: 0.6549 Epoch 6/10 5266/5266 [==============================] - 47s 9ms/step - loss: 0.9104 - accuracy: 0.5369 - val_loss: 0.5921 - val_accuracy: 0.7801 Epoch 7/10 5266/5266 [==============================] - 43s 8ms/step - loss: 0.9101 - accuracy: 0.5381 - val_loss: 0.6093 - val_accuracy: 0.8006 Epoch 8/10 5266/5266 [==============================] - 50s 9ms/step - loss: 0.9094 - accuracy: 0.5384 - val_loss: 0.6058 - val_accuracy: 0.7317 Epoch 9/10 5266/5266 [==============================] - 46s 9ms/step - loss: 0.9086 - accuracy: 0.5384 - val_loss: 0.5969 - val_accuracy: 0.7121 Epoch 10/10 5266/5266 [==============================] - 44s 8ms/step - loss: 0.9088 - accuracy: 0.5378 - val_loss: 0.6549 - val_accuracy: 0.5715 1925/1925 [==============================] - 7s 3ms/step Accuracy: 0.5875314554752821 Precision: 0.18211652059766031 Recall: 0.5276876608121713 F1 Score: 0.2707807118254879
This code demonstrates the process of constructing and training a deep learning model to classify the severity of road accidents. It starts with loading and preprocessing the dataset using pandas, which involves removing unnecessary columns and separating features from the label. The dataset is then divided into training and testing sets using train_test_split, followed by feature scaling with StandardScaler and addressing class imbalance with SMOTE. The model is built using TensorFlow's Keras API, incorporating a custom attention mechanism layer that allows the model to weigh input features differently based on their importance. Additionally, the architecture includes multiple dense layers, batch normalization, and dropout layers to improve training outcomes and prevent overfitting. Class weights are adjusted to further tackle the issue of class imbalance. After training, predictions are made on the test set, and several key performance metrics, such as accuracy, precision, recall, and F1 score, are calculated to assess the model's effectiveness. This workflow not only covers model construction and training but also emphasizes crucial preprocessing and postprocessing steps, showcasing a comprehensive approach to a machine learning project from start to finish.
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Threshold the predicted probabilities into binary class labels
y_pred_classes = (y_pred > 0.5).astype(int)
# Confusion matrix of true vs. predicted severity classes
conf_matrix = confusion_matrix(y_test, y_pred_classes)
# Heatmap visualisation with human-readable class names
class_names = ['Light', 'Severe']
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
# Unpack the 2x2 matrix row by row: row 0 = true negatives/false positives,
# row 1 = false negatives/true positives (sklearn's label ordering).
(TN, FP), (FN, TP) = conf_matrix
sensitivity = TP / float(TP + FN)
specificity = TN / float(TN + FP)
print(f"Sensitivity (Recall or True Positive Rate): {sensitivity:.2f}")
print(f"Specificity (True Negative Rate): {specificity:.2f}")
Sensitivity (Recall or True Positive Rate): 0.53 Specificity (True Negative Rate): 0.60